# -*- coding: utf-8 -*-
"""
For preprocessing the CDC data (by Race).

"""

import sys
import argparse
import codecs
import random
from collections import defaultdict


# Fix the RNG seed so that any use of `random` is reproducible across runs.
# NOTE(review): nothing in the visible code draws from `random` -- confirm
# whether this seeding is still needed.
random.seed(1776)



# x <- inputDataSet$X
# t <- inputDataSet$Y
# n <- inputDataSet$N
#
# b <- inputDataSet$W1
# W2

# x - Black or African American population
# t - death
# n - population
# W1 - proportion of deaths in the Black or African American population
# W2 - proportion of deaths in the White population

def new_county_info():
    """
    Build a zeroed accumulator for one county.
    :return: a new dict on every call, so it can be mutated freely
    """
    info = dict.fromkeys(("totalT", "totalNotT", "N", "W1", "W2"), 0.0)
    info["totalX"] = 0
    info["totalNotX"] = 0
    return info

def to_float_or_default_field_value(field_value):
    """
    Convert a raw field value to a float.
    :return: 0.0 for the literal marker "Suppressed", else float(field_value)
    """
    return 0.0 if field_value == "Suppressed" else float(field_value)

def dd(n, d):
    """
    Default division: divide n by d, guarding against a falsy denominator.
    :return: n / d, or 0.0 when d is zero (or otherwise falsy)
    """
    if not d:
        return 0.0
    return n / d


def get_csv_lines(filepath_with_name):
    """
    Convert a tab-separated export into per-county CSV lines.

    The first line of the file is skipped as a header, and reading stops at
    the first line whose stripped content is the quoted marker "---".
    Each data row must split into 7 tab-separated fields and be either a
    "Black or African American" or a "White" row. Rows are accumulated, and
    one output line is emitted when the second row sharing the same first
    field has been read.

    :param filepath_with_name: path of the UTF-8 encoded input file
    :return: list of newline-terminated CSV strings, header line first
    """
    # race label -> (expected race code, population key, death-proportion key)
    race_slots = {
        "Black or African American": ("2054-5", "totalX", "W1"),
        "White": ("2106-3", "totalNotX", "W2"),
    }

    header = ["X", "notX", "T", "notT", "W1", "W2", "N"]
    lines = [",".join(header) + "\n"]
    rows_seen = defaultdict(int)
    county_info = new_county_info()

    with codecs.open(filepath_with_name, encoding="utf-8") as f:
        for row_number, raw in enumerate(f):
            if raw.strip() == '"---"':
                break  # end-of-data marker
            if row_number == 0:
                continue  # skip header

            fields = raw.strip().split("\t")
            assert len(fields) == 7
            county_code = fields[0]
            race = fields[2].strip('"')
            race_code = fields[3].strip('"')
            deaths = to_float_or_default_field_value(fields[4])
            pop = to_float_or_default_field_value(fields[5])
            rows_seen[county_code] += 1

            slot = race_slots.get(race)
            if slot is None:
                # only the two races above are expected in the input
                assert False
            else:
                expected_code, pop_key, rate_key = slot
                assert race_code == expected_code
                county_info[pop_key] = pop
                county_info["totalT"] += deaths
                county_info["totalNotT"] += (pop - deaths)
                county_info["N"] += pop
                county_info[rate_key] = dd(deaths, pop)

            seen = rows_seen[county_code]
            if seen == 2:
                total = county_info["N"]
                assert county_info["totalX"] + county_info["totalNotX"] == total
                # column order: X, notX, T, notT (as shares of N), W1, W2, N
                shares = [dd(county_info[key], total)
                          for key in ("totalX", "totalNotX", "totalT", "totalNotT")]
                values = shares + [county_info["W1"], county_info["W2"], total]
                lines.append(",".join(["%f" % val for val in values]) + "\n")
                # start a clean accumulator for the next county
                county_info = new_county_info()
            elif seen != 1:
                # a county key may appear at most twice
                assert False

    return lines


def save_lines(filename_with_path, list_of_lists):
    """
    Write the given strings to a UTF-8 file, replacing any existing content.
    :param filename_with_path: path of the file to (over)write
    :param list_of_lists: iterable of strings, written back-to-back with no
        separator added
    """
    with codecs.open(filename_with_path, "w", encoding="utf-8") as out:
        out.write("".join(list_of_lists))


                                 
def main(arguments):
    """
    Command-line entry point: convert the input file and write the result.

    :param arguments: list of command-line arguments, without the program name
    :return: None, so ``sys.exit(main(...))`` exits with status 0
    :raises SystemExit: raised by argparse when an option is missing or invalid
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # Both paths are mandatory. Without required=True an omitted option would
    # reach codecs.open() as None and fail with an unrelated TypeError instead
    # of a usage message.
    parser.add_argument('--input_file', required=True, help="input_file")
    parser.add_argument('--output_file', required=True, help="output_file")

    args = parser.parse_args(arguments)

    lines = get_csv_lines(args.input_file)
    save_lines(args.output_file, lines)


if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)

